# Preparing the undergrad validated tweets for analysis
# Nora Webb Williams
# Started 10/21/2019

#prelim
library(tidyverse)
library(lubridate)

# Read in the initial set of tweets (tab-separated export).
# Any column not named below is parsed as text via `.default`.
raw_tweets <- read_tsv(
  file = "data/gun_org_data_from_2017_01_01_to_2018_04_01.tsv",
  col_types = cols(
    .default = col_character(),
    # identifiers: kept as text
    master_id = col_character(),
    user_id = col_character(),
    tweet_id = col_character(),
    # integer-valued columns
    index = col_integer(),
    tweet_epochtime = col_integer(),
    initial_retweets = col_integer(),
    initial_favorites = col_integer(),
    num_followers = col_integer(),
    gun_control = col_integer(),
    # double-valued columns
    delayed_retweets = col_double(),
    delayed_favorites = col_double(),
    retweet_percent = col_double()
  )
)

## Keep the retweet/favorite counts plus the image and video fields,
## and add a date-only variable for the day of each tweet
tweet_retweets <- raw_tweets %>%
  mutate(
    # Epoch seconds -> date/time object
    pretty_date = as_datetime(tweet_epochtime),
    # The same instant on the America/New_York clock
    ESTdate = with_tz(pretty_date, tzone = "America/New_York"),
    # Easier if this is date only
    tweet_dayfloor = as.Date(floor_date(ESTdate, "day"))
  ) %>%
  select(
    tweet_id, user_id, name, tweet_dayfloor, tweet_epochtime, num_followers,
    initial_retweets, initial_favorites, delayed_retweets, delayed_favorites,
    retweet_status, images_list, video_id, pretty_date,
    # Save the tweet text so the later merges can be checked against it
    orig_tweet_text = tweet_text
  ) %>%
  # Keep tweets from epoch 1517472000 (2018-02-01 08:00:00 UTC) through
  # 1522566000 (2018-04-01 07:00:00 UTC), both ends included
  filter(
    tweet_epochtime >= 1517472000,
    tweet_epochtime <= 1522566000
  ) %>%
  #filter(!str_detect(tweet_id, "E"),
  #       !str_detect(tweet_id, "e")) %>%
  # Drop exact duplicate rows
  distinct()

# Are there tweets here that have the issue with lots of 000s?
# Any that were messed up, or can this be used to fix the broken tweet_ids?
# Author's note: only 24 here, not the 64. So perhaps some of these are fixed?!
suspect_tweets_rts <- filter(tweet_retweets, str_detect(tweet_id, "0000"))

# Just the ids of those suspect rows, as a plain vector
suspect_tweets_rts_ids <- pull(suspect_tweets_rts, tweet_id)

### For the text match later:
# keep only the text and the id so validated tweets can be matched on text
orig_tweets_ids <- select(
  tweet_retweets,
  orig_tweet_text,
  orig_tweet_id = tweet_id
)

# Followers over time
# Read in the imputed follower counts (one row per account per day)
#ts_position_summaries <- read_csv("results-data/ts_position_summaries.csv")
imputed_parkland <- read_csv(
  file = "results-data/expanded_imputed_followers_around_parkland.csv",
  col_types = cols(
    name = col_character(),
    user_id = col_character(),
    gun_control = col_character(),
    tweet_dayfloor = col_date(format = ""),
    avg_followers = col_double(),
    imputed_followers = col_double()
  )
)

# Formatting: collapse to one row per gun position per day.
# Rows without a day are dropped first; they would only form NA-day groups
# that are not wanted in the result.
followers_formatted <- imputed_parkland %>%
  filter(!is.na(tweet_dayfloor)) %>%
  group_by(gun_control, tweet_dayfloor) %>%
  summarise(
    # number of rows in the position-day group
    num_groups = n(),
    # total imputed followers across those rows
    num_followers = sum(imputed_followers),
    # mean imputed followers per row
    avg_followers = num_followers / num_groups
  )


# Lookup table of account name and gun position:
# one row per distinct (name, gun_control) pair in the raw tweets
slim_name_gun_control <- distinct(raw_tweets, name, gun_control)


## Read in the validated data.
## Everything is kept as text except the topic model score.
raw_validated <- read_csv(
  file = "data/Tweets_Data_Final_2_0.csv",
  col_types = cols(
    .default = col_character(),
    tweet_id = col_character(),
    tweet_text = col_character(),
    Topic = col_character(),
    Score = col_character(),
    is_valid = col_character(),
    is_valid_2 = col_character(),
    is_valid_3 = col_character(),
    Topic_Model_Score = col_double()
  )
)

## How many of these have suspiciously round tweet ids
## (an id containing a run of four zeros)?
suspect <- filter(raw_validated, str_detect(tweet_id, "0000"))

# Save them for inspection
write_csv(suspect, "data/suspect_tweet_ids.csv")

## List the suspect tweet_ids as a plain vector
suspect_ids <- pull(suspect, tweet_id)

# Are they all unique? Nope (compare the two counts)
length(suspect_ids)
length(unique(suspect_ids))

# Show the suspect ids from the validated file and from the retweet data
print(suspect_ids)
print(suspect_tweets_rts_ids)

### For each text in the suspect file, look that text up in the
# tweets/retweets data and attach the id found there
# (assuming there is only one match!)
text_match <- arrange(
  left_join(
    suspect,
    orig_tweets_ids,
    by = c("tweet_text" = "orig_tweet_text")
  ),
  tweet_id
)

### After that merge, which tweets still don't have a match?
# 19
unmatched_tweets <- filter(text_match, is.na(orig_tweet_id))

write_csv(unmatched_tweets, "data/unmatched_tweets.csv")

## Read in the manually checked matches for those 19 tweets
# (see Google drive)
checked_unmatched_raw <- read_csv(
  file = "data/checked_unmatched_tweets.csv",
  col_types = cols(
    .default = col_character(),
    tweet_id = col_character(),
    tweet_text = col_character(),
    Topic = col_character(),
    Score = col_character(),
    is_valid = col_character(),
    is_valid_2 = col_character(),
    is_valid_3 = col_character(),
    Topic_Model_Score = col_double(),
    orig_tweet_id = col_character(),
    quoted_orig_tweet_id = col_character()
  )
)

checked_unmatched <- checked_unmatched_raw %>%
  # Drop the exported id column: the Google doc can only export it in
  # scientific notation
  select(-orig_tweet_id) %>%
  # Strip the double quotes from the quoted copy to recover orig_tweet_id
  mutate(orig_tweet_id = str_replace_all(quoted_orig_tweet_id, '"', '')) %>%
  # The quoted column is no longer needed
  select(-quoted_orig_tweet_id)


##### From the text match
## Which are the ones that now do have a match?
# 50
matched_tweets <- filter(text_match, !is.na(orig_tweet_id))

## Version that keeps the new tweet_id and drops the old one
matched_tweets_clean <- bind_rows(
  matched_tweets,
  # add in the additional manually matched tweets
  # (gets to the 69 matched tweet attempts)
  checked_unmatched
) %>%
  # replace the broken id with the recovered one
  select(-tweet_id) %>%
  rename(tweet_id = orig_tweet_id) %>%
  # drop those where no id was found after a manual Twitter search
  filter(!is.na(tweet_id))
  


### FIXED: Kevin went back and added back in the deleted topics
# And what about the ones where the topic is NA?
#topic_na <- raw_validated %>% 
#  filter(is.na(Topic))

#write_csv(topic_na, "data/topic_na.csv")

#### Processing the master validated data, removing funky bits
length(unique(raw_validated$tweet_id)) #5072

# Drop tweet_ids that have been ruined, then add back the repaired ones
master_no_funky <- raw_validated %>%
  # drop obs with odd/suspect tweet_ids: any id containing "E", "e" or "0000"
  filter(!str_detect(tweet_id, "E|e|0000")) %>%
  # add back in the re-matched tweets that had odd/suspect tweet_ids
  bind_rows(matched_tweets_clean) %>%
  mutate(
    # make all the Scores lower case
    Score = str_to_lower(Score),
    # clean the topic names to match what's in the paper/prior work
    Topic = case_when(
      Topic == "Gun Free Zone" ~ "Gun Free Zones",
      Topic == "Remembrance" ~ "Remembrances",
      Topic == "School Safety" ~ "School Security",
      TRUE ~ Topic
    )
  ) %>%
  select(tweet_id, tweet_text, Score, Topic, Topic_Model_Score) %>%
  arrange(tweet_id) %>%
  # get rid of the single duplicate row, which somehow does still exist
  distinct()


# Number of distinct tweets kept
length(unique(master_no_funky$tweet_id)) #5066 (tweets have multiple)

### Reshape the data so it's
# ONE ROW PER TWEET, where the value under each topic column is the
# probability of that topic IF that topic was considered valid for the tweet.
# The issue with this one is that there are newly validated tweets, so their
# original probability isn't here.
slim_validated_tweets_with_probs <- master_no_funky %>%
  # only keep the tweets that were valid on that given topic
  filter(Score == "y") %>%
  select(tweet_id, tweet_text, Topic, Topic_Model_Score) %>%
  arrange(tweet_id) %>%
  # very hacky way to solve the duplicate entry issue
  distinct() %>%
  # one column per Topic, holding Topic_Model_Score
  spread(key = Topic, value = Topic_Model_Score)



######################################
######################################
# Merging the data together

# Start from the retweet data, attach the per-topic probabilities by tweet id,
# then attach each account's gun position by account name
master_new <- tweet_retweets %>%
  left_join(slim_validated_tweets_with_probs, by = "tweet_id") %>%
  # merge with the info about the account
  left_join(slim_name_gun_control, by = "name") %>%
  # keep the original tweet text rather than the validated file's copy
  select(-tweet_text) %>%
  rename(tweet_text = orig_tweet_text)


# Write that out for basic regression analysis
write_csv(master_new, "results-data/tweets_with_topic_probs.csv")

######################################
######################################
# Summaries and figures!!!!
# should really be a new script
######################################
######################################

### 
# Long format: one row per tweet per topic column. Probability is NA when the
# tweet has no value under that topic column.
# The column range relies on `Assault Weapons` being the first and
# `School Shootings` the last topic column of master_new.
gathered_tweet_topics <- master_new %>%
  gather(`Assault Weapons`:`School Shootings`,
         key = "Topic", value = "Probability") %>%
  arrange(tweet_id)

## Topic counts by side:
## number of tweets with a non-missing Probability, per gun position and topic
total_topic_by_side <- gathered_tweet_topics %>%
  mutate(has_topic = if_else(is.na(Probability), 0, 1)) %>%
  group_by(gun_control, Topic) %>%
  summarize(num_tweets = sum(has_topic))

# Retweets by topic:
# total initial retweets of the tweets carrying each topic, per gun position.
# as.numeric() makes the sum a double rather than an integer.
rts_topics_by_side <- gathered_tweet_topics %>%
  filter(!is.na(Probability)) %>%
  group_by(gun_control, Topic) %>%
  summarize(num_rts = sum(as.numeric(initial_retweets), na.rm = TRUE))

# Topics that are flagged as "Frame" (everything else is a plain "Topic")
frame_names <- c(
  "Enforcement Failure", "Mental Health", "School Security", "Gun Free Zones",
  "Assault Weapons", "Gun Control", "Background Checks", "Political Action"
)

## Total tweets and rts per side, summed over every topic
totals <- total_topic_by_side %>%
  left_join(rts_topics_by_side, by = c("gun_control", "Topic")) %>%
  # topics with no retweet row count as zero retweets
  replace_na(list(num_rts = 0)) %>%
  group_by(gun_control) %>%
  summarise(
    total_tweets = sum(num_tweets),
    total_rts = sum(num_rts)
  )

# Gun control side (gun_control == 1)
gc_total_tweets <- totals %>%
  filter(gun_control == 1) %>%
  pull(total_tweets)

gc_total_rts <- totals %>%
  filter(gun_control == 1) %>%
  pull(total_rts)

# Gun rights side (gun_control == 0)
gr_total_tweets <- totals %>%
  filter(gun_control == 0) %>%
  pull(total_tweets)

gr_total_rts <- totals %>%
  filter(gun_control == 0) %>%
  pull(total_rts)

# Merge the tweet counts and retweet counts together, one row per side-topic
topic_side_all <- total_topic_by_side %>%
  left_join(rts_topics_by_side, by = c("gun_control", "Topic")) %>%
  replace_na(list(num_rts = 0)) %>%
  mutate(
    # is this topic one of the frames?
    frame = if_else(Topic %in% frame_names, "Frame", "Topic"),
    # retweets per tweet, and its inverse
    rt_tw_ratio = num_rts / num_tweets,
    tw_rt_ration = num_tweets / num_rts,
    # readable label for the side
    gun_position_str = if_else(gun_control == 1, "Gun Control", "Gun Rights")
  )

# Reorder the frame factor so "Topic" comes before "Frame"
topic_side_all$frame <- fct_relevel(topic_side_all$frame, "Topic", "Frame")

# Subset GR/GC
# For each side, express every topic's tweets and retweets as a percentage of
# that side's own totals. Three topics are left out of these tables.

## Subset GC (gun_control == 1); denominators are the gun control totals
gc_data <- topic_side_all %>% 
  filter(gun_control == 1 & !Topic %in% c("Pride Brunch",
                                          "meta_youth_leadership",
                                          "Incredulity")) %>% 
  mutate(pct_tweets = 100*(num_tweets/gc_total_tweets),
         pct_rts = 100*(num_rts/gc_total_rts),
         # positive when a topic's share of retweets exceeds its share of tweets
         tweet_rt_pct_diff = pct_rts - pct_tweets)


## Gathering for easier combined bar plots:
## one row per topic per percentage type (tweets vs retweets)
gc_data_gather <- gc_data %>% 
  select(Topic, gun_control, 
         pct_tweets, pct_rts,
         tweet_rt_pct_diff) %>% 
  gather(key = pct_type, value= pct, -Topic, -gun_control, -tweet_rt_pct_diff) %>% 
  mutate(pct_type = if_else(pct_type == "pct_rts", "Retweets", "Tweets"))


# Reorder the pct_type factor so "Retweets" comes first
gc_data_gather$pct_type <- fct_relevel(gc_data_gather$pct_type, "Retweets", "Tweets")

## Subset GR (gun_control == 0)
# The gun rights percentages must be taken against the gun rights totals
# (gr_total_tweets / gr_total_rts), not the gun control totals.
gr_data <- topic_side_all %>% 
  filter(gun_control == 0 & !Topic %in% c("Pride Brunch",
                                          "meta_youth_leadership",
                                          "Incredulity")) %>% 
  mutate(pct_tweets = 100*(num_tweets/gr_total_tweets),
         pct_rts = 100*(num_rts/gr_total_rts),
         tweet_rt_pct_diff = pct_rts - pct_tweets)


## Gathering for easier combined bar plots
gr_data_gather <- gr_data %>% 
  select(Topic, gun_control, 
         pct_tweets, pct_rts,
         tweet_rt_pct_diff) %>% 
  gather(key = pct_type, value= pct, -Topic, -gun_control, -tweet_rt_pct_diff) %>% 
  mutate(pct_type = if_else(pct_type == "pct_rts", "Retweets", "Tweets"))

# Reorder the pct_type factor so "Retweets" comes first
gr_data_gather$pct_type <- fct_relevel(gr_data_gather$pct_type, "Retweets", "Tweets")


############# Initial figures
# gridExtra supplies grid.arrange(), used to place two plots side by side.
# library() stops right here if the package is missing; require() would only
# return FALSE and let the script fail later at grid.arrange().
library(gridExtra)

### Figure showing percentages of total tweets/rts by side

# Text sizes shared by both gun control percentage panels
gc_pct_text_sizes <- theme(
  axis.title.x = element_text(size = 18),
  axis.text.x = element_text(size = 14),
  axis.title.y = element_text(size = 18),
  axis.text.y = element_text(size = 14),
  legend.title = element_text(size = 16),
  legend.text = element_text(size = 14)
)

### Left panel: percent of GC tweets/rts by topic, bars side by side,
### topics ordered by the retweet-minus-tweet percentage difference
gc_tw_rct_pcts <- ggplot(data = gc_data_gather) +
  geom_col(
    aes(x = reorder(Topic, tweet_rt_pct_diff), y = pct, fill = pct_type),
    position = "dodge"
  ) +
  scale_fill_brewer(name = "Tweets or\n Retweets?", palette = "OrRd") +
  # Need to flip the order of the colors in the legend!
  guides(fill = guide_legend(reverse = TRUE)) +
  coord_flip() +
  theme_classic() +
  gc_pct_text_sizes +
  xlab("Topic") +
  ylab("Percent of Total Gun Control\n Tweets and Retweets")

print(gc_tw_rct_pcts)

### Right panel: difference in percent of GC retweets and tweets by topic.
### Same topic ordering as the left panel, so the topic labels are removed.
gc_diff_rt_tw_pcts <- ggplot(data = gc_data) +
  geom_col(
    aes(
      x = reorder(Topic, tweet_rt_pct_diff),
      y = tweet_rt_pct_diff,
      fill = frame
    )
  ) +
  scale_fill_brewer(name = "Frame?", palette = "PuRd") +
  guides(fill = guide_legend(reverse = TRUE)) +
  coord_flip() +
  theme_classic() +
  gc_pct_text_sizes +
  # Remove the Topic labels
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  # reference line at no difference
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Topic") +
  ylab("Difference in Percent of\n Total Retweets and Percent Tweets")

print(gc_diff_rt_tw_pcts)

# On one plot, written to PDF
pdf("results-figures/gc_pct_topics_tweets_rts_parkland.pdf", width = 12)
#png("results-figures/gc_topics_tweets_rts_parkland.png")
grid.arrange(gc_tw_rct_pcts, gc_diff_rt_tw_pcts, ncol = 2, widths = c(2.4, 2))
dev.off()

#################

# Text sizes shared by both gun rights percentage panels
gr_pct_text_sizes <- theme(
  axis.title.x = element_text(size = 18),
  axis.text.x = element_text(size = 14),
  axis.title.y = element_text(size = 18),
  axis.text.y = element_text(size = 14),
  legend.title = element_text(size = 16),
  legend.text = element_text(size = 14)
)

### Left panel: percent of GR tweets/rts by topic, bars side by side,
### topics ordered by the retweet-minus-tweet percentage difference
gr_tw_rct_pcts <- ggplot(data = gr_data_gather) +
  geom_col(
    aes(x = reorder(Topic, tweet_rt_pct_diff), y = pct, fill = pct_type),
    position = "dodge"
  ) +
  scale_fill_brewer(name = "Tweets or\n Retweets?", palette = "OrRd") +
  # Need to flip the order of the colors in the legend!
  guides(fill = guide_legend(reverse = TRUE)) +
  coord_flip() +
  theme_classic() +
  gr_pct_text_sizes +
  xlab("Topic") +
  ylab("Percent of Total Gun Rights \n Tweets and Retweets")

print(gr_tw_rct_pcts)

### Right panel: difference in percent of GR retweets and tweets by topic.
### Same topic ordering as the left panel, so the topic labels are removed.
gr_diff_rt_tw_pcts <- ggplot(data = gr_data) +
  geom_col(
    aes(
      x = reorder(Topic, tweet_rt_pct_diff),
      y = tweet_rt_pct_diff,
      fill = frame
    )
  ) +
  scale_fill_brewer(name = "Frame?", palette = "PuRd") +
  guides(fill = guide_legend(reverse = TRUE)) +
  coord_flip() +
  theme_classic() +
  gr_pct_text_sizes +
  # Remove the Topic labels
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  # reference line at no difference
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Topic") +
  ylab("Difference in Percent of \n Total Retweets and Percent Tweets")

print(gr_diff_rt_tw_pcts)

# On one plot, written to PDF
pdf("results-figures/gr_pct_topics_tweets_rts_parkland.pdf", width = 12)
#png("results-figures/gc_topics_tweets_rts_parkland.png")
grid.arrange(gr_tw_rct_pcts, gr_diff_rt_tw_pcts, ncol = 2, widths = c(2.4, 2))
dev.off()

######################
# Comparing number of tweets per topic per side
# The plot is stored in a variable so that ggsave() is told exactly which
# plot to write, instead of relying on whatever plot was created last.
num_tweets_by_topic_side <- ggplot(
  data = filter(
    topic_side_all,
    !Topic %in% c("Pride Brunch",
                  "meta_youth_leadership",
                  "Incredulity")
  )
) +
  # one bar per side for each topic, topics ordered by number of tweets
  geom_bar(aes(x = reorder(Topic, num_tweets),
               y = num_tweets,
               fill = gun_position_str),
           stat = "identity",
           position = "dodge") +
  guides(fill = guide_legend(reverse = TRUE)) +
  coord_flip() +
  theme_classic() +
  scale_fill_brewer(name = "Position", palette = "BuPu") +
  theme(axis.title.x = element_text(size = 16),
        axis.text.x  = element_text(size = 14),
        axis.title.y = element_text(size = 16),
        axis.text.y  = element_text(size = 14),
        legend.title = element_text(size = 16),
        legend.text = element_text(size = 14)) +
  xlab("Topic") +
  ylab("Number of Tweets")

# Show it, like the other figures in this script
print(num_tweets_by_topic_side)

ggsave("results-figures/num_tweets_by_topic_and_side.pdf",
       plot = num_tweets_by_topic_side,
       width = 9)

################################################
### First round figures
### Gun Control Orgs tweets and RTs by topic

# Rows for the gun control side, leaving out three topics
gc_first_round_rows <- filter(
  topic_side_all,
  gun_control == 1 & !Topic %in% c("Pride Brunch",
                                   "meta_youth_leadership",
                                   "Incredulity")
)

# Text sizes shared by the three gun control panels
gc_first_round_text_sizes <- theme(
  axis.title.x = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.title.y = element_text(size = 16),
  axis.text.y = element_text(size = 14),
  legend.title = element_text(size = 16),
  legend.text = element_text(size = 14)
)

# Hides the topic axis on panels that sit to the right of a labelled panel
gc_first_round_no_topic_axis <- theme(
  axis.title.y = element_blank(),
  axis.text.y = element_blank(),
  axis.ticks.y = element_blank()
)

# Number of tweets per topic (keeps the topic labels, no legend)
gc_numtweets <- ggplot(data = gc_first_round_rows) +
  geom_col(aes(x = Topic, y = num_tweets, fill = frame)) +
  #  scale_x_discrete(limits = rev(levels(Topic))) +
  coord_flip() +
  theme_classic() +
  gc_first_round_text_sizes +
  scale_fill_brewer(name = "Frame?", palette = "OrRd") +
  theme(legend.position = "none") +
  xlab("Topic") +
  ylab("Number of Tweets")

print(gc_numtweets)

# Number of retweets per topic
gc_rts <- ggplot(data = gc_first_round_rows) +
  geom_col(aes(x = Topic, y = num_rts, fill = frame)) +
  coord_flip() +
  theme_classic() +
  gc_first_round_text_sizes +
  gc_first_round_no_topic_axis +
  scale_fill_brewer(name = "Frame?", palette = "OrRd") +
  xlab("Topic") +
  ylab("Number of Retweets")

print(gc_rts)


# Retweets per tweet, per topic (printed only; not part of the saved PDF)
gc_avg_rts <- ggplot(data = gc_first_round_rows) +
  geom_col(aes(x = Topic, y = rt_tw_ratio, fill = frame)) +
  coord_flip() +
  theme_classic() +
  gc_first_round_text_sizes +
  gc_first_round_no_topic_axis +
  scale_fill_brewer(name = "Frame?", palette = "OrRd") +
  xlab("Topic") +
  ylab("Average Retweets per Tweet")

print(gc_avg_rts)


# Tweets and retweets on one plot, written to PDF
pdf("results-figures/gc_topics_tweets_rts_parkland.pdf", width = 10.5)
#png("results-figures/gc_topics_tweets_rts_parkland.png")
grid.arrange(gc_numtweets, gc_rts, ncol = 2, widths = c(2.3, 2))
dev.off()

######### Gun Rights version
### Gun Rights Orgs tweets and RTs by topic

# Rows for the gun rights side, leaving out three topics
gr_first_round_rows <- filter(
  topic_side_all,
  gun_control == 0 & !Topic %in% c("Pride Brunch",
                                   "meta_youth_leadership",
                                   "Incredulity")
)

# Text sizes shared by the three gun rights panels
gr_first_round_text_sizes <- theme(
  axis.title.x = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.title.y = element_text(size = 16),
  axis.text.y = element_text(size = 14),
  legend.title = element_text(size = 16),
  legend.text = element_text(size = 14)
)

# Number of tweets per topic (keeps the topic labels, no legend)
gr_numtweets <- ggplot(data = gr_first_round_rows) +
  geom_col(aes(x = Topic, y = num_tweets, fill = frame)) +
  #  scale_x_discrete(limits = rev(levels(Topic))) +
  coord_flip() +
  theme_classic() +
  gr_first_round_text_sizes +
  scale_fill_brewer(name = "Frame?", palette = "OrRd") +
  theme(legend.position = "none") +
  xlab("Topic") +
  ylab("Number of Tweets")

print(gr_numtweets)

# Number of retweets per topic (topic axis hidden)
gr_rts <- ggplot(data = gr_first_round_rows) +
  geom_col(aes(x = Topic, y = num_rts, fill = frame)) +
  #  scale_x_discrete(limits = rev(levels(Topic))) +
  coord_flip() +
  theme_classic() +
  gr_first_round_text_sizes +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  scale_fill_brewer(name = "Frame?", palette = "OrRd") +
  xlab("Topic") +
  ylab("Number of Retweets")

print(gr_rts)

# Retweets per tweet, per topic (printed only; not part of the saved PDF).
# Unlike the panel above, the topic tick labels are kept here; only the axis
# title and tick marks are hidden.
gr_avg_rts <- ggplot(data = gr_first_round_rows) +
  geom_col(aes(x = Topic, y = rt_tw_ratio, fill = frame)) +
  #  scale_x_discrete(limits = rev(levels(Topic))) +
  coord_flip() +
  theme_classic() +
  gr_first_round_text_sizes +
  theme(
    axis.title.y = element_blank(),
    #  axis.text.y=element_blank(),
    axis.ticks.y = element_blank()
  ) +
  scale_fill_brewer(name = "Frame?", palette = "OrRd") +
  xlab("Topic") +
  ylab("Average Retweets per Tweet")

print(gr_avg_rts)

### Instead, what percentage of tweets were on that topic?
# What percentage of retweets?

# Tweets and retweets on one plot, written to PDF
pdf("results-figures/gr_topics_tweets_rts_parkland.pdf", width = 10.5)
#png("results-figures/gr_topics_tweets_rts_parkland.png")
grid.arrange(gr_numtweets, gr_rts, ncol = 2, widths = c(2.3, 2))
dev.off()


################
## Topic counts by org:
## number of tweets with a non-missing Probability, per account and topic
total_topic_by_org <- gathered_tweet_topics %>% 
  mutate(has_topic = if_else(is.na(Probability), 0, 1)) %>%
  group_by(name, Topic) %>% 
  summarize(num_tweets = sum(has_topic))


# Retweets by topic by org:
# total initial retweets of the tweets carrying each topic, per account
rts_topics_by_org <- gathered_tweet_topics %>% 
  filter(!is.na(Probability)) %>% 
  group_by(name, Topic) %>% 
  summarize(num_rts  = sum(initial_retweets, na.rm = TRUE))


## Daily, number of tweets per Topic (by gun position)
# NOTE(review): day_floor here and below is floored from pretty_date (built
# with as_datetime(), which is UTC unless a tz is given), whereas
# tweet_dayfloor earlier in this script is floored on the America/New_York
# clock. Confirm which day convention the followers file uses before relying
# on the day-level joins below.
# Here day_floor stays a date/time; the later summaries convert it to a Date.
daily_topic_summary <- gathered_tweet_topics %>% 
  mutate(day_floor = floor_date(pretty_date, 'day'),
         has_topic = if_else(is.na(Probability), 0, 1)) %>% 
  group_by(gun_control, day_floor, Topic) %>% 
  summarize(num_tweets = sum(has_topic))

write_csv(daily_topic_summary, 'results-data/daily_topic_summary.csv')
#toJSON(daily_topic_summary, 'results-data/daily_topic_summary.json')

## Daily retweets by topic (by gun position)
# gun_control is converted to character so it can be joined to
# followers_formatted, where it was read as character
daily_topic_rts <- gathered_tweet_topics %>% 
  filter(!is.na(Probability)) %>% 
  mutate(day_floor = as_date(floor_date(pretty_date, 'day')),
         gun_control = as.character(gun_control)) %>% 
  group_by(gun_control, day_floor, Topic) %>% 
  summarize(num_rts  = sum(initial_retweets, na.rm = TRUE))

## Daily retweets with followers, too (matched on day and gun position)
daily_topic_rts_with_followers <- left_join(daily_topic_rts, followers_formatted, by = c("day_floor" = "tweet_dayfloor",
                                                                                         "gun_control"))


## Daily, number of tweets per Topic per organization
daily_org_topic_summary <- gathered_tweet_topics %>% 
  mutate(day_floor = as_date(floor_date(pretty_date, 'day')),
         has_topic = if_else(is.na(Probability), 0, 1)) %>% 
  group_by(name, day_floor, Topic) %>% 
  summarize(num_tweets = sum(has_topic))

write_csv(daily_org_topic_summary, 'results-data/daily_org_topic_summary.csv')

# Spreading the num tweets per topic: one "num_tweets_<topic>" column per topic
# NOTE(review): str_replace() substitutes only the first space, so a topic
# name with more than one space (e.g. "Gun Free Zones") keeps a space in its
# column name. Left as is because the written CSV's column names may be used
# downstream; consider str_replace_all() if they are not.
spread_daily_org_topic_summary <- daily_org_topic_summary %>% 
  mutate(Topic = str_c("num_tweets_", Topic),
         Topic = str_replace(Topic, " ", "_"),
         Topic = str_to_lower(Topic)) %>% 
  spread(Topic, num_tweets) 


# Daily, number of rts per topic per organization
daily_org_topic_rts <- gathered_tweet_topics %>% 
  filter(!is.na(Probability)) %>%
  mutate(day_floor = as_date(floor_date(pretty_date, 'day'))) %>% 
  group_by(name, day_floor, Topic) %>% 
  summarize(num_rts  = sum(initial_retweets, na.rm = TRUE)) 

# Spread so org-day is the unit of analysis: one "rts_<topic>" column per topic
# (same first-space-only replacement as for the tweet counts above)
spread_daily_org_topic_rts <- daily_org_topic_rts %>% 
  mutate(Topic = str_c("rts_", Topic),
         Topic = str_replace(Topic, " ", "_"),
         Topic = str_to_lower(Topic)) %>% 
  spread(Topic, num_rts) 


### Merge the number of daily org tweets with a topic to the number of RTs for that topic
daily_org_merge <- left_join(spread_daily_org_topic_summary, spread_daily_org_topic_rts,  
                             by = c("name", "day_floor"))

## Then merge to the number of account followers on that day
master_daily_org_data <- left_join(daily_org_merge, imputed_parkland, 
                                   by = c("name", "day_floor" = "tweet_dayfloor"))


write_csv(master_daily_org_data, "results-data/daily_org_topics_and_followers.csv")


